import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import xgboost as xgb
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from scipy import stats
from scipy.stats import normaltest
from pmdarima import auto_arima
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import plotly.express as px
import plotly.graph_objects as go
import pmdarima as pm
from sqlalchemy import create_engine, MetaData, Table, select, and_
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
import datetime
import warnings
# Additional settings
pd.set_option('display.max_columns', None)  # show every column when displaying DataFrames
warnings.filterwarnings("ignore") # specify to ignore warning messages
%matplotlib inline
# Define the SQL Server connection parameters
# NOTE(review): credentials are hard-coded placeholders; prefer loading them
# from environment variables or a secrets manager before sharing/deploying.
server = 'airborneanalytics.cpdhuvhrm3nv.us-east-1.rds.amazonaws.com'
database = 'airborneanalytics'
username = 'Hidden'
password = 'Hidden'
# SQLAlchemy connection URL for SQL Server via pyodbc (ODBC Driver 17)
connection_string = f"mssql+pyodbc://{username}:{password}@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server"
engine = create_engine(connection_string)
def get_airport_traffic_data(user_origin, sample_data=False):
    """
    Retrieve airport traffic data joined with US holiday information.

    Parameters:
    - user_origin: Airport code (e.g. 'DTW') or 'ALL' for every airport
    - sample_data: Use the bundled CSV sample (True) or query the live database (False)

    Returns:
    - daily_ap_cnt_df: DataFrame containing airport traffic data
    """
    if sample_data:
        # Load sample data and filter for the pre-2020 study period
        daily_ap_cnt_df = pd.read_csv("daily_ap_cnt_df.csv").drop(columns=['Unnamed: 0'])
        daily_ap_cnt_df = daily_ap_cnt_df[(daily_ap_cnt_df['FlightDate'] <= '2019-12-31')]
        return daily_ap_cnt_df
    # Define the view and table names
    airport_traffic_table_name = 'Airport_Traffic_2'
    holidays_table_name = 'US_Holidays'
    # Reflect the view and table structures
    metadata = MetaData()
    airport_traffic_table = Table(airport_traffic_table_name, metadata, autoload=True, autoload_with=engine)
    holidays_table = Table(holidays_table_name, metadata, autoload=True, autoload_with=engine)
    # Left-join traffic to holidays so non-holiday dates keep NULL holiday
    # fields, restricting the join to the study window.
    select_query = select([airport_traffic_table, holidays_table]).select_from(
        airport_traffic_table.outerjoin(holidays_table, and_(
            airport_traffic_table.c.FlightDate == holidays_table.c.Date,
            airport_traffic_table.c.FlightDate <= '2019-12-31'
        ))
    )
    # Filter to a single airport unless the caller asked for all of them
    if user_origin != 'ALL':
        select_query = select_query.where(airport_traffic_table.c.Airport == user_origin)
    # Fix: use a context manager so the connection is released even if the
    # query raises (the original leaked the connection on error).
    with engine.connect() as connection:
        daily_ap_cnt_df = pd.read_sql_query(select_query, connection)
    return daily_ap_cnt_df
def feature_engineering(df):
    """
    Add calendar/holiday features to the raw traffic DataFrame.

    Parameters:
    - df: DataFrame containing raw data; must have 'FlightDate', 'Date' and
      'Holiday' columns (as returned by get_airport_traffic_data)

    Returns:
    - df: copy of the input indexed by FlightDate with Month, DayOfWeek,
      weekofyear and holiday_bool columns added
    """
    # Fix: work on a copy so the caller's DataFrame is not mutated
    # (the original rewrote 'FlightDate' on the caller's object).
    df = df.copy()
    # Ensure 'FlightDate' is in datetime format
    df['FlightDate'] = pd.to_datetime(df['FlightDate'])
    # Index by date, drop the redundant holiday join key, sort chronologically
    df = df.set_index('FlightDate').drop(columns=['Date'])
    df = df.sort_index()
    # Calendar features derived from the datetime index
    df['Month'] = df.index.month
    df['DayOfWeek'] = df.index.dayofweek
    df['weekofyear'] = df.index.isocalendar().week
    # 1 when the date matched a row in the holidays table, else 0
    df['holiday_bool'] = pd.notnull(df['Holiday']).astype(int)
    # Uncomment the line below if you want to one-hot encode the categorical variables
    # df = pd.get_dummies(df, columns=['Month', 'Holiday', 'DayOfWeek', 'weekofyear'], prefix=['Month', 'Holiday', 'DayOfWeek', 'weekofyear'], dtype=int)
    return df
def plot_daily_flight_trend(df, columns, ap_code):
    """
    Render an interactive line chart of daily flight counts for one airport.

    Parameters:
    - df: DataFrame of flight data indexed by date
    - columns: Columns to plot
    - ap_code: Airport code used in the chart title
    """
    fig = px.line(
        df,
        x=df.index,
        y=columns,
        title=f"Daily Inbound/Outbound Traffic at {ap_code}",
        template='seaborn',
    )
    # Axis formatting: monthly ticks, rotated month-year labels
    fig.update_xaxes(title_text='Flight Date', tickangle=45, tickformat="%b %Y", dtick="M1")
    fig.update_yaxes(title_text='Flight Count')
    # Centered title, fixed canvas, unnamed horizontal legend in the top-left
    fig.update_layout(
        title_x=0.5,
        height=600,
        width=1000,
        legend_title_text='',
        legend=dict(x=0, y=1, traceorder='normal', orientation='h'),
    )
    fig.show()
def create_box_plot(dataframe, x_column):
    """
    Draw a box plot of total airport traffic grouped by a categorical column.

    Parameters:
    - dataframe: DataFrame containing a 'Total_AirTraffic_count' column
    - x_column: Column to group along the x-axis
    """
    box_fig = px.box(
        dataframe,
        x=x_column,
        y='Total_AirTraffic_count',
        title=f'Total Airport Traffic by {x_column}',
        template='seaborn',
    )
    # Label the axes and center the title above the plot
    box_fig.update_xaxes(title_text=x_column)
    box_fig.update_yaxes(title_text='Flight Count')
    box_fig.update_layout(title_x=0.5)
    box_fig.show()
def decompose_series(pd_series):
    """
    Decompose a time series and plot its trend, seasonal, and residual parts.

    Parameters:
    - pd_series: Pandas Series representing a time series (index convertible
      to datetime)
    """
    # Fix: copy first so we never mutate the caller's index in place
    # (the original reassigned pd_series.index on the caller's object).
    series = pd_series.copy()
    series.index = pd.to_datetime(series.index)
    # Additive decomposition; seasonal_decompose infers the period from the
    # index frequency.
    decomp = sm.tsa.seasonal_decompose(series)
    # One stacked panel per component, sharing the x-axis
    fig, axes = plt.subplots(4, 1, figsize=(9, 9), sharex=True)
    panels = [
        (series, 'Original Time Series'),
        (decomp.trend, 'Trend'),
        (decomp.seasonal, 'Seasonal Component'),
        (decomp.resid, 'Residual'),
    ]
    for ax, (component, title) in zip(axes, panels):
        ax.plot(component)
        ax.set_title(title)
    # Clamp the shared x-axis to the observed date range
    axes[3].set_xlim(series.index.min(), series.index.max())
    # Monthly tick labels, rotated for readability
    axes[3].xaxis.set_major_locator(mdates.MonthLocator())
    axes[3].xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    plt.setp(axes[3].xaxis.get_majorticklabels(), rotation=45, ha="right")
    plt.tight_layout()
    plt.show()
def test_stationarity(timeseries, window=12, cutoff=0.01):
    """
    Plot rolling statistics and run an Augmented Dickey-Fuller test.

    Parameters:
    - timeseries: Pandas Series representing a time series
    - window: Rolling window size for mean and standard deviation
    - cutoff: P-value threshold below which the unit-root null is rejected

    Outputs:
    - Displays an interactive rolling-statistics plot and prints ADF results
    """
    # Rolling statistics used to eyeball trend / variance changes
    rolmean = timeseries.rolling(window).mean()
    rolstd = timeseries.rolling(window).std()
    # Create traces for Plotly
    trace_original = go.Scatter(x=timeseries.index, y=timeseries, mode='lines', name='Original', line=dict(color='blue'))
    trace_mean = go.Scatter(x=rolmean.index, y=rolmean, mode='lines', name='Rolling Mean', line=dict(color='red'))
    trace_std = go.Scatter(x=rolstd.index, y=rolstd, mode='lines', name='Rolling Std', line=dict(color='black'))
    layout = go.Layout(
        title='Rolling Mean & Standard Deviation',
        xaxis=dict(title='Date'),
        yaxis=dict(title='Value'),
        showlegend=True,
        title_x=0.5,  # Center the title
        title_y=0.9   # Adjust the y-coordinate as needed
    )
    fig = go.Figure(data=[trace_original, trace_mean, trace_std], layout=layout)
    fig.show()
    # Augmented Dickey-Fuller test with constant + trend regression
    print('Results of Dickey-Fuller Test:')
    adftest = adfuller(timeseries, autolag='AIC', regression='ct')
    print("Null Hypothesis: The series has a unit root (non-stationary)")
    print("ADF-Statistic:", adftest[0])
    print("P-Value:", adftest[1])
    print("Number of lags:", adftest[2])
    print("Number of observations:", adftest[3])
    print("Critical Values:", adftest[4])
    # Fix: the original hard-coded 0.05 in the note and never used `cutoff`;
    # report against the actual threshold and state the conclusion.
    print(f"Note: If P-Value is smaller than {cutoff}, we reject the null hypothesis and the series is stationary")
    if adftest[1] < cutoff:
        print(f"Conclusion: p-value {adftest[1]:.4g} < {cutoff} -> reject the null; the series looks stationary")
    else:
        print(f"Conclusion: p-value {adftest[1]:.4g} >= {cutoff} -> fail to reject the null; the series looks non-stationary")
def plot_residual_analysis(model):
    """
    Plot residual diagnostics (distribution, ACF, PACF) for a fitted model.

    Parameters:
    - model: fitted statsmodels time series results object exposing `.resid`
    """
    # Get residuals from the model and convert to NumPy array
    resid = np.array(model.resid)
    # D'Agostino-Pearson normality test on the residuals
    normal_test_result = normaltest(resid)
    print(f"Normality Test: Chi-squared={normal_test_result.statistic}, p-value={normal_test_result.pvalue}")
    fig, (ax0, ax1, ax2) = plt.subplots(3, 1, figsize=(12, 12))
    # Residual histogram with a fitted normal overlay.
    # Fix: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
    # histplot + an explicit stats.norm fit reproduces the same picture.
    (mu, sigma) = stats.norm.fit(resid)
    sns.histplot(resid, stat='density', kde=True, ax=ax0)
    grid = np.linspace(resid.min(), resid.max(), 200)
    ax0.plot(grid, stats.norm.pdf(grid, mu, sigma), color='black')
    # Raw string so \mu / \sigma are LaTeX, not invalid escape sequences
    ax0.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
    ax0.set_ylabel('Frequency')
    ax0.set_title('Residual Distribution')
    # Autocorrelation of residuals: significant lags imply unmodeled structure
    sm.graphics.tsa.plot_acf(resid, lags=40, ax=ax1)
    ax1.set_title('Autocorrelation Function (ACF)')
    sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)
    ax2.set_title('Partial Autocorrelation Function (PACF)')
    plt.tight_layout()
    plt.show()
def calculate_first_difference(df, column):
    """
    Return a copy of the DataFrame with `column` replaced by its first difference.

    Parameters:
    - df: DataFrame
    - column: Name of the column to difference (value at t minus value at
      t-1; the first row becomes NaN)

    Returns:
    - differenced: copy of `df` with the differenced column
    """
    differenced = df.copy()
    differenced[column] = differenced[column].diff(periods=1)
    return differenced
def smape_kun(y_true, y_pred):
    """
    Calculate MAPE and Symmetric Mean Absolute Percentage Error (SMAPE).

    Parameters:
    - y_true: True values (array-like; MAPE is NaN/inf if zeros are present)
    - y_pred: Predicted values (array-like, same length)

    Returns:
    - mape: Mean Absolute Percentage Error (percent)
    - smape: Symmetric MAPE (percent, 0-200 scale); pairs where both values
      are 0 contribute 0 instead of NaN
    """
    # Fix: the original called .fillna(0) on the SMAPE terms, which only
    # works for pandas Series; converting to ndarrays also accepts lists
    # and numpy arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    abs_err = np.abs(y_true - y_pred)
    mape = np.mean(abs_err / np.abs(y_true)) * 100
    denom = np.abs(y_pred) + np.abs(y_true)
    # 0/0 pairs (both values zero) are defined as 0, matching fillna(0)
    smape = np.mean(np.where(denom == 0, 0.0, abs_err * 200 / np.where(denom == 0, 1.0, denom)))
    return mape, smape
def calculate_metrics(results_df, test_col='Test', pred_cols=None):
    """
    Compute MAPE and SMAPE for each prediction column against the truth column.

    Parameters:
    - results_df: DataFrame holding the truth column and prediction columns
    - test_col: Name of the ground-truth column
    - pred_cols: Prediction columns to score; defaults to every column
      except `test_col`

    Returns:
    - metrics_df: DataFrame indexed by ['MAPE', 'SMAPE'] with one column
      per scored model
    """
    if pred_cols is None:
        pred_cols = results_df.columns.difference([test_col])
    metrics_df = pd.DataFrame(index=['MAPE', 'SMAPE'])
    for model_col in pred_cols:
        actual = results_df[test_col]
        predicted = results_df[model_col]
        # Score only rows where both truth and prediction are present
        keep = ~(np.isnan(actual) | np.isnan(predicted))
        actual = actual[keep]
        predicted = predicted[keep]
        abs_err = (actual - predicted).abs()
        mape = (abs_err / actual).abs().mean() * 100
        # 0/0 pairs are treated as a zero contribution via fillna(0)
        smape = (abs_err * 200 / (predicted.abs() + actual.abs())).fillna(0).mean()
        metrics_df[model_col] = [mape, smape]
    return metrics_df
# Retrieve sample airport traffic data for all airports
# (sample_data=True reads the bundled CSV instead of hitting the database)
ap_cnt_query_df = get_airport_traffic_data('ALL', sample_data=True)
# Perform feature engineering on the sample data
daily_ap_cnt_df = feature_engineering(ap_cnt_query_df)
# Display the first few rows of the processed DataFrame
daily_ap_cnt_df.head()
| Airport | Origin_AirTraffic_count | Dest_AirTraffic_count | Total_AirTraffic_count | Holiday | Month | DayOfWeek | weekofyear | holiday_bool | |
|---|---|---|---|---|---|---|---|---|---|
| FlightDate | |||||||||
| 2018-01-01 | DTW | 322 | 320 | 642 | New Year's Day | 1 | 0 | 1 | 1 |
| 2018-01-02 | DTW | 413 | 419 | 832 | NaN | 1 | 1 | 1 | 0 |
| 2018-01-03 | DTW | 407 | 403 | 810 | NaN | 1 | 2 | 1 | 0 |
| 2018-01-04 | DTW | 406 | 405 | 811 | NaN | 1 | 3 | 1 | 0 |
| 2018-01-05 | DTW | 410 | 408 | 818 | NaN | 1 | 4 | 1 | 0 |
# Select a specific airport for analysis
airport_selection = 'DTW'
# Keep only that airport's rows inside the pre-2020 study window
daily_ap_cnt_df_filtered = daily_ap_cnt_df[(daily_ap_cnt_df['Airport'] == airport_selection) & (daily_ap_cnt_df.index <= '2019-12-31')]
daily_ap_cnt_df_filtered
| Airport | Origin_AirTraffic_count | Dest_AirTraffic_count | Total_AirTraffic_count | Holiday | Month | DayOfWeek | weekofyear | holiday_bool | |
|---|---|---|---|---|---|---|---|---|---|
| FlightDate | |||||||||
| 2018-01-01 | DTW | 322 | 320 | 642 | New Year's Day | 1 | 0 | 1 | 1 |
| 2018-01-02 | DTW | 413 | 419 | 832 | NaN | 1 | 1 | 1 | 0 |
| 2018-01-03 | DTW | 407 | 403 | 810 | NaN | 1 | 2 | 1 | 0 |
| 2018-01-04 | DTW | 406 | 405 | 811 | NaN | 1 | 3 | 1 | 0 |
| 2018-01-05 | DTW | 410 | 408 | 818 | NaN | 1 | 4 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2019-12-27 | DTW | 453 | 453 | 906 | NaN | 12 | 4 | 52 | 0 |
| 2019-12-28 | DTW | 410 | 410 | 820 | NaN | 12 | 5 | 52 | 0 |
| 2019-12-29 | DTW | 446 | 447 | 893 | NaN | 12 | 6 | 52 | 0 |
| 2019-12-30 | DTW | 454 | 453 | 907 | NaN | 12 | 0 | 1 | 0 |
| 2019-12-31 | DTW | 313 | 315 | 628 | New Year's Eve | 12 | 1 | 1 | 1 |
730 rows × 9 columns
# Define the columns to plot
columns_to_plot = ['Total_AirTraffic_count', 'Origin_AirTraffic_count', 'Dest_AirTraffic_count']
# Plot the daily flight trends for selected columns and airport
plot_daily_flight_trend(daily_ap_cnt_df_filtered, columns_to_plot, airport_selection)
# Distribution of traffic by day of week and by ISO week of year
create_box_plot(daily_ap_cnt_df_filtered, 'DayOfWeek')
#create_box_plot(daily_ap_cnt_df_filtered, 'DayofMonth')
create_box_plot(daily_ap_cnt_df_filtered, 'weekofyear')
# Decomp and plot
decompose_series(daily_ap_cnt_df_filtered['Total_AirTraffic_count'])
# One-hot encode the calendar/holiday categoricals for the exogenous models
daily_ap_cnt_df_filtered = pd.get_dummies(daily_ap_cnt_df_filtered, columns = ['Month','Holiday','DayOfWeek','weekofyear'] , prefix = ['Month','Holiday','DayOfWeek','weekofyear'], dtype=int)
# ADF stationarity test on the raw (undifferenced) series
test_stationarity(daily_ap_cnt_df_filtered['Total_AirTraffic_count'])
Results of Dickey-Fuller Test:
Null Hypothesis: The series has a unit root (non-stationary)
ADF-Statistic: -2.6495726813210214
P-Value: 0.2576105666695362
Number of lags: 20
Number of observations: 709
Critical Values: {'1%': -3.9715957585172452, '5%': -3.4167004865653943, '10%': -3.1307046974434787}
Note: If P-Value is smaller than 0.05, we reject the null hypothesis and the series is stationary
# First-difference the series to remove the trend, then retest stationarity
first_diff_df = calculate_first_difference(daily_ap_cnt_df_filtered, 'Total_AirTraffic_count')
# Drop the leading NaN created by differencing
first_diff_series = first_diff_df['Total_AirTraffic_count'].dropna(inplace = False)
test_stationarity(first_diff_series, window = 12)
Results of Dickey-Fuller Test:
Null Hypothesis: The series has a unit root (non-stationary)
ADF-Statistic: -8.678465539174628
P-Value: 1.9552948708891452e-12
Number of lags: 20
Number of observations: 708
Critical Values: {'1%': -3.9716139550505587, '5%': -3.416709284224621, '10%': -3.130709870667292}
Note: If P-Value is smaller than 0.05, we reject the null hypothesis and the series is stationary
After differencing, the p-value is extremely small. Thus this series is very likely to be stationary.
def ACF_PACF_Plot(series, lags):
    """Plot the ACF (top) and PACF (bottom) of `series` up to `lags` lags."""
    figure = plt.figure(figsize=(12, 8))
    acf_ax = figure.add_subplot(211)
    sm.graphics.tsa.plot_acf(series, lags=lags, ax=acf_ax)
    pacf_ax = figure.add_subplot(212)
    sm.graphics.tsa.plot_pacf(series, lags=lags, ax=pacf_ax)
ACF_PACF_Plot(first_diff_series, 25) #uncomment to view plot for differenced series
ACF_PACF_Plot(daily_ap_cnt_df_filtered['Total_AirTraffic_count'], 25)
# NOTE(review): first_diff_series is already differenced once, and
# order=(3, 1, 1) applies another difference inside ARIMA (d=1) --
# effectively second-order differencing; confirm this is intended.
arima_mod = sm.tsa.ARIMA(first_diff_series, order=(3, 1, 1)).fit()
print(arima_mod.summary())
SARIMAX Results
==================================================================================
Dep. Variable: Total_AirTraffic_count No. Observations: 729
Model: ARIMA(3, 1, 1) Log Likelihood -4545.347
Date: Tue, 05 Dec 2023 AIC 9100.694
Time: 11:14:26 BIC 9123.646
Sample: 01-02-2018 HQIC 9109.550
- 12-31-2019
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.4822 0.052 -9.302 0.000 -0.584 -0.381
ar.L2 -0.4755 0.054 -8.844 0.000 -0.581 -0.370
ar.L3 -0.1962 0.059 -3.353 0.001 -0.311 -0.082
ma.L1 -0.9999 0.968 -1.033 0.302 -2.898 0.898
sigma2 1.533e+04 1.45e+04 1.057 0.291 -1.31e+04 4.38e+04
===================================================================================
Ljung-Box (L1) (Q): 0.30 Jarque-Bera (JB): 361.29
Prob(Q): 0.58 Prob(JB): 0.00
Heteroskedasticity (H): 1.02 Skew: -1.57
Prob(H) (two-sided): 0.88 Kurtosis: 4.42
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
To see how our first model performs, we can plot the residual distribution. Our very low p-value suggests that the residuals do not follow a normal distribution. Deviations from normality may indicate that the model is missing some important features or that the underlying data has non-Gaussian characteristics.
ACF and PACF plots help assess whether there is any autocorrelation or remaining structure in the residuals. If significant autocorrelation is present at certain lags, it may indicate that the model has not captured all the temporal patterns.
plot_residual_analysis(arima_mod)
Normality Test: Chi-squared=188.66674447253126, p-value=1.0753181556000887e-41
Since our residuals do not follow a normal distribution, we will attempt to use Auto ARIMA to find the optimal p, d, and q values
# Use auto_arima to find the optimal ARIMA parameters
# NOTE(review): no m= is passed here, so pmdarima's default period applies
# despite seasonal=True -- confirm seasonal terms are actually searched
# (the later forecast function passes m=12 explicitly).
auto_arima_model = auto_arima(daily_ap_cnt_df_filtered['Total_AirTraffic_count'], seasonal=True, suppress_warnings=True, stepwise=True)
# Extract the optimal order
order = auto_arima_model.order
# Fit the ARIMA model with the optimal order
arima_model_auto = sm.tsa.ARIMA(daily_ap_cnt_df_filtered['Total_AirTraffic_count'], order=order).fit()
print(arima_model_auto.summary())
SARIMAX Results
==================================================================================
Dep. Variable: Total_AirTraffic_count No. Observations: 730
Model: ARIMA(5, 1, 4) Log Likelihood -4321.753
Date: Tue, 05 Dec 2023 AIC 8663.506
Time: 11:15:23 BIC 8709.422
Sample: 01-01-2018 HQIC 8681.222
- 12-31-2019
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.1998 0.054 3.715 0.000 0.094 0.305
ar.L2 -0.9644 0.043 -22.331 0.000 -1.049 -0.880
ar.L3 -0.0599 0.076 -0.787 0.431 -0.209 0.089
ar.L4 -0.5185 0.043 -12.101 0.000 -0.603 -0.435
ar.L5 -0.5917 0.052 -11.397 0.000 -0.693 -0.490
ma.L1 -0.8127 0.031 -26.074 0.000 -0.874 -0.752
ma.L2 1.3712 0.038 35.824 0.000 1.296 1.446
ma.L3 -0.7726 0.037 -20.853 0.000 -0.845 -0.700
ma.L4 0.9334 0.027 34.850 0.000 0.881 0.986
sigma2 1.233e+04 645.541 19.106 0.000 1.11e+04 1.36e+04
===================================================================================
Ljung-Box (L1) (Q): 3.45 Jarque-Bera (JB): 2520.58
Prob(Q): 0.06 Prob(JB): 0.00
Heteroskedasticity (H): 1.94 Skew: -0.02
Prob(H) (two-sided): 0.00 Kurtosis: 12.11
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
plot_residual_analysis(arima_model_auto)
Normality Test: Chi-squared=170.3100847655441, p-value=1.041441757083328e-37
# SARIMA on the differenced series: nonseasonal (5,1,4) plus seasonal
# (5,1,4) terms with a 12-step period.
# NOTE(review): the series is daily, so period 12 means a 12-day cycle,
# not monthly seasonality -- confirm the intended period (weekly = 7).
sarima_mod = sm.tsa.statespace.SARIMAX(first_diff_series,order=(5, 1, 4),seasonal_order=(5, 1, 4, 12),
enforce_invertibility=False,enforce_stationarity=False, trend='n')
sarima_results = sarima_mod.fit()
print(sarima_results.summary())
SARIMAX Results
==========================================================================================
Dep. Variable: Total_AirTraffic_count No. Observations: 729
Model: SARIMAX(5, 1, 4)x(5, 1, 4, 12) Log Likelihood -4051.335
Date: Tue, 05 Dec 2023 AIC 8140.670
Time: 11:44:09 BIC 8225.762
Sample: 01-02-2018 HQIC 8173.673
- 12-31-2019
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.7871 3.786 -0.208 0.835 -8.208 6.634
ar.L2 -1.4560 1.628 -0.894 0.371 -4.647 1.735
ar.L3 -0.6356 4.779 -0.133 0.894 -10.002 8.731
ar.L4 -0.3750 0.461 -0.814 0.416 -1.278 0.528
ar.L5 -0.1582 1.049 -0.151 0.880 -2.215 1.899
ma.L1 -0.2308 3.740 -0.062 0.951 -7.561 7.099
ma.L2 0.3628 2.203 0.165 0.869 -3.956 4.682
ma.L3 -0.7487 2.011 -0.372 0.710 -4.689 3.192
ma.L4 -0.3139 3.606 -0.087 0.931 -7.381 6.753
ar.S.L12 0.0782 0.429 0.182 0.855 -0.763 0.919
ar.S.L24 -0.6564 0.376 -1.745 0.081 -1.394 0.081
ar.S.L36 -0.4114 0.477 -0.863 0.388 -1.346 0.523
ar.S.L48 -0.2989 0.247 -1.212 0.225 -0.782 0.184
ar.S.L60 -0.6227 0.317 -1.962 0.050 -1.245 -0.001
ma.S.L12 -0.8291 0.511 -1.621 0.105 -1.831 0.173
ma.S.L24 0.2475 0.704 0.351 0.725 -1.133 1.628
ma.S.L36 0.1426 0.627 0.227 0.820 -1.086 1.371
ma.S.L48 -0.1790 0.398 -0.450 0.653 -0.959 0.601
sigma2 3.368e+04 2979.215 11.306 0.000 2.78e+04 3.95e+04
===================================================================================
Ljung-Box (L1) (Q): 3.03 Jarque-Bera (JB): 1349.95
Prob(Q): 0.08 Prob(JB): 0.00
Heteroskedasticity (H): 2.05 Skew: -0.26
Prob(H) (two-sided): 0.00 Kurtosis: 10.04
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
plot_residual_analysis(sarima_results)
Normality Test: Chi-squared=81.63656135287913, p-value=1.8743295037205634e-18
def forecast_and_create_df(train, test, train_orig, test_orig, is_original_series=True):
    """
    Forecast with ARIMA, Auto-ARIMA, SARIMA, and SARIMAX(+exog) models,
    plot the results, and return a DataFrame with all predictions.

    Parameters:
    - train: Training set (original or differenced scale) incl. feature columns
    - test: Test set matching `train`
    - train_orig: Training set on the ORIGINAL (undifferenced) scale
    - test_orig: Test set on the original scale; used to seed the inverse
      differencing and as the plotted/scored actuals
    - is_original_series: True when train/test are on the original scale,
      False when they are first-differenced

    Returns:
    - results_df: DataFrame with the actual series and one prediction column
      per model (NaN outside the forecast window)
    """
    # Actuals over the full horizon, always on the original scale
    combined_series = pd.concat([train_orig['Total_AirTraffic_count'], test_orig['Total_AirTraffic_count']])
    results_df = pd.DataFrame(combined_series, columns=['Total_AirTraffic_count'])
    # Exogenous regressors = everything except identifiers and the targets
    exog_train = train.drop(columns=['Airport', 'Origin_AirTraffic_count', 'Dest_AirTraffic_count', 'Total_AirTraffic_count'])
    exog_test = test.drop(columns=['Airport', 'Origin_AirTraffic_count', 'Dest_AirTraffic_count', 'Total_AirTraffic_count'])
    # Difference inside ARIMA only when the input is not already differenced
    d = 0 if is_original_series else 1
    # ARIMA(3, d, 1) baseline on the training data
    arima_fit = ARIMA(train['Total_AirTraffic_count'], order=(3, d, 1)).fit()
    # Auto-ARIMA stepwise search with a 12-period seasonal cycle
    auto_arima_fit = pm.auto_arima(train['Total_AirTraffic_count'], seasonal=True, m=12, stepwise=True, suppress_warnings=True)
    auto_arima_order = auto_arima_fit.order
    # Reuse the nonseasonal (p, d, q) as the seasonal order with period 12
    auto_arima_order_seasonal = auto_arima_order + (12,)
    # Fit SARIMA model on the training data
    sarima_fit = sm.tsa.statespace.SARIMAX(train['Total_AirTraffic_count'], order=auto_arima_order, seasonal_order=auto_arima_order_seasonal,
                                           enforce_invertibility=False, enforce_stationarity=False, trend='n').fit()
    # Fix: the original if/else assigned (5, 0, 3) on both branches
    sarimax_exog_order = (5, 0, 3)
    print(sarimax_exog_order)  # debug echo of the SARIMAX order used
    # Fit the SARIMAX model with exogenous calendar/holiday variables
    sarimax_exog = SARIMAX(
        endog=train['Total_AirTraffic_count'],
        exog=exog_train,
        order=sarimax_exog_order,
        enforce_invertibility=False,
        enforce_stationarity=False,
        trend='n'
    ).fit()
    # Make predictions for the forecast period
    horizon = len(test['Total_AirTraffic_count'])
    y_pred_arima = arima_fit.get_forecast(steps=horizon).predicted_mean.round().astype(int)
    y_pred_auto_arima = np.round(auto_arima_fit.predict(n_periods=horizon)).astype(int)
    y_pred_sarima = sarima_fit.get_forecast(steps=horizon).predicted_mean.round().astype(int)
    y_pred_sarimax_exog = sarimax_exog.get_forecast(steps=horizon, exog=exog_test).predicted_mean
    if not is_original_series:
        # Invert the first difference by cumulative summation.
        # NOTE(review): this seeds the cumsum with the FIRST TEST value; a
        # strict inversion would add the last TRAIN value instead -- confirm.
        base = test_orig['Total_AirTraffic_count'].iloc[0]
        y_pred_arima = np.cumsum(y_pred_arima) + base
        y_pred_auto_arima = np.cumsum(y_pred_auto_arima) + base
        y_pred_sarima = np.cumsum(y_pred_sarima) + base
        y_pred_sarimax_exog = np.cumsum(y_pred_sarimax_exog) + base
    # Attach predictions; rows outside the forecast window stay NaN
    results_df['ARIMA_Predictions'] = np.nan
    results_df.loc[test.index, 'ARIMA_Predictions'] = y_pred_arima
    results_df['Auto_ARIMA_Predictions'] = np.nan
    results_df.loc[test.index, 'Auto_ARIMA_Predictions'] = y_pred_auto_arima
    results_df['SARIMA_Predictions'] = np.nan
    results_df.loc[test.index, 'SARIMA_Predictions'] = y_pred_sarima
    results_df['SARIMAX_exog_Predictions'] = np.nan
    results_df.loc[test_orig.index, 'SARIMAX_exog_Predictions'] = y_pred_sarimax_exog
    # Plot 1: full series with all model predictions overlaid
    fig, ax = plt.subplots(2, 1, figsize=(22, 10), sharex=False)
    ax[0].plot(results_df.index, results_df['Total_AirTraffic_count'], color='black', label='Original Series', linewidth=2)
    if not is_original_series:
        ax[0].plot(test_orig.index, test_orig['Total_AirTraffic_count'], linestyle='--', color='red', label='Test Set Original')
    else:
        ax[0].plot(test.index, test['Total_AirTraffic_count'], linestyle='--', color='red', label='Test Set Original')
    ax[0].plot(results_df.index, results_df['ARIMA_Predictions'], color='green', label='ARIMA Predictions')
    ax[0].plot(results_df.index, results_df['Auto_ARIMA_Predictions'], color='blue', label='Auto-ARIMA Predictions')
    ax[0].plot(results_df.index, results_df['SARIMA_Predictions'], color='orange', label='SARIMA Predictions')
    # Fix: this legend entry duplicated 'SARIMA Predictions' in the original
    ax[0].plot(results_df.index, results_df['SARIMAX_exog_Predictions'], color='purple', label='SARIMAX Exog Predictions')
    # Start the overview plot mid-2018 so the forecast window is readable
    start_date = pd.to_datetime("2018-07-01", format='%Y-%m-%d')
    end_date = results_df.index.max()
    ax[0].set_xlim(start_date, end_date)
    ax[0].set_ylabel('Number of Flights')
    ax[0].set_title("Model Comparisons")
    ax[0].legend()
    # Plot 2: zoom on the predicted time frame only
    if not is_original_series:
        ax[1].plot(test_orig.index, test_orig['Total_AirTraffic_count'], linestyle='--', color='red', label='Test Set Original', linewidth=3)
    else:
        ax[1].plot(test.index, test['Total_AirTraffic_count'], linestyle='--', color='red', label='Test Set Original', linewidth=3)
    ax[1].plot(results_df.index, results_df['ARIMA_Predictions'], color='green', label='ARIMA Predictions')
    ax[1].plot(results_df.index, results_df['Auto_ARIMA_Predictions'], color='blue', label='Auto-ARIMA Predictions')
    ax[1].plot(results_df.index, results_df['SARIMA_Predictions'], color='orange', label='SARIMA Predictions')
    ax[1].plot(results_df.index, results_df['SARIMAX_exog_Predictions'], color='purple', label='SARIMA Exog Predictions')
    ax[1].set_xlabel('Date')
    ax[1].set_ylabel('Number of Flights')
    ax[1].set_title("Predicted Time Frame")
    ax[1].legend()
    ax[1].set_xlim(test.index.min(), test.index.max())
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    return results_df
# Reuse the filtered, one-hot-encoded frame for the train/test split
daily_ap_cnt_df_split = daily_ap_cnt_df_filtered
# Hold out Nov-Dec 2019 as the test window
train_orig = daily_ap_cnt_df_split[daily_ap_cnt_df_split.index < pd.to_datetime("2019-11-01", format='%Y-%m-%d')]
test_orig = daily_ap_cnt_df_split[daily_ap_cnt_df_split.index >= pd.to_datetime("2019-11-01", format='%Y-%m-%d')]
# Split the differenced series into training and test sets
train_diff = first_diff_df[first_diff_df.index < pd.to_datetime("2019-11-01")].dropna(inplace = False)
test_diff = first_diff_df[first_diff_df.index >= pd.to_datetime("2019-11-01")].dropna(inplace = False)
# Create models for original data and plot results
results_orig = forecast_and_create_df(train_orig, test_orig, train_orig, test_orig, is_original_series=True)
(5, 0, 3)
# Create models for differenced data and plot results
# (predictions are integrated back to the original scale inside the function)
results_diff = forecast_and_create_df(train_diff, test_diff, train_orig, test_orig, is_original_series=False)
(5, 0, 3)
# Model results comparison for original data (lower MAPE/SMAPE is better)
model_results_comparison_orig = calculate_metrics(results_orig, test_col='Total_AirTraffic_count', pred_cols=None)
model_results_comparison_orig
| ARIMA_Predictions | Auto_ARIMA_Predictions | SARIMAX_exog_Predictions | SARIMA_Predictions | |
|---|---|---|---|---|
| MAPE | 13.733473 | 12.187771 | 8.603332 | 13.368576 |
| SMAPE | 12.776799 | 11.583008 | 9.268125 | 13.338992 |
# Model results comparison for differenced data (lower MAPE/SMAPE is better)
model_results_comparison_diff = calculate_metrics(results_diff, test_col='Total_AirTraffic_count', pred_cols=None)
model_results_comparison_diff
| ARIMA_Predictions | Auto_ARIMA_Predictions | SARIMAX_exog_Predictions | SARIMA_Predictions | |
|---|---|---|---|---|
| MAPE | 14.946106 | 12.827291 | 18.741870 | 11.709806 |
| SMAPE | 14.462437 | 11.550313 | 16.097013 | 10.699226 |
# One-hot dummy columns produced by pd.get_dummies above, used as the
# feature set for the gradient-boosting model
features = [ 'holiday_bool',
'Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6',
'Month_7', 'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12',
'Holiday_4th of July', 'Holiday_Christmas Day','Holiday_Christmas Eve','Holiday_Columbus Day',
'Holiday_Eastern Easter','Holiday_Juneteenth', 'Holiday_Labor Day','Holiday_Labor Day Weekend',
'Holiday_Martin Luther King, Jr. Day','Holiday_Memorial Day',"Holiday_New Year's Day","Holiday_New Year's Eve",
'Holiday_Thanksgiving Day','Holiday_Thanksgiving Eve',"Holiday_Valentine's Day",'Holiday_Veterans Day',
"Holiday_Washington's Birthday",'Holiday_Western Easter', 'DayOfWeek_0',
'DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3', 'DayOfWeek_4', 'DayOfWeek_5',
'DayOfWeek_6', 'weekofyear_1', 'weekofyear_2', 'weekofyear_3', 'weekofyear_4', 'weekofyear_5', 'weekofyear_6',
'weekofyear_7', 'weekofyear_8', 'weekofyear_9', 'weekofyear_10', 'weekofyear_11', 'weekofyear_12',
'weekofyear_13', 'weekofyear_14', 'weekofyear_15', 'weekofyear_16', 'weekofyear_17', 'weekofyear_18',
'weekofyear_19', 'weekofyear_20', 'weekofyear_21', 'weekofyear_22', 'weekofyear_23', 'weekofyear_24',
'weekofyear_25', 'weekofyear_26', 'weekofyear_27', 'weekofyear_28', 'weekofyear_29', 'weekofyear_30',
'weekofyear_31', 'weekofyear_32', 'weekofyear_33', 'weekofyear_34', 'weekofyear_35', 'weekofyear_36',
'weekofyear_37', 'weekofyear_38', 'weekofyear_39', 'weekofyear_40', 'weekofyear_41', 'weekofyear_42',
'weekofyear_43', 'weekofyear_44', 'weekofyear_45', 'weekofyear_46', 'weekofyear_47', 'weekofyear_48',
'weekofyear_49', 'weekofyear_50', 'weekofyear_51', 'weekofyear_52']
# Feature matrices / targets using the same Nov-2019 train/test split
X_train = train_orig[features]
y_train = train_orig['Total_AirTraffic_count']
X_test = test_orig[features]
y_test = test_orig['Total_AirTraffic_count']
# Gradient-boosted tree regressor on the calendar/holiday dummies.
# Fix: objective 'reg:linear' was deprecated and later removed in favour of
# 'reg:squarederror' (same squared-error loss, modern name).
best_reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                            n_estimators=1500,
                            # early_stopping_rounds=50,  # enable to stop when validation RMSE plateaus
                            objective='reg:squarederror',
                            max_depth=2,        # shallow trees to limit overfitting
                            learning_rate=0.03,
                            n_jobs=-1, tree_method='hist')
# Track RMSE on both the train and test folds, logging every 100 rounds
best_reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100)
[0] validation_0-rmse:855.12359 validation_1-rmse:868.16557 [100] validation_0-rmse:73.76627 validation_1-rmse:111.14404 [200] validation_0-rmse:50.50315 validation_1-rmse:91.71482 [300] validation_0-rmse:44.94912 validation_1-rmse:90.38057 [400] validation_0-rmse:41.82458 validation_1-rmse:89.75202 [500] validation_0-rmse:39.38206 validation_1-rmse:90.21338 [600] validation_0-rmse:37.99533 validation_1-rmse:90.57905 [700] validation_0-rmse:36.86766 validation_1-rmse:91.06883 [800] validation_0-rmse:36.20041 validation_1-rmse:91.25753 [900] validation_0-rmse:35.51057 validation_1-rmse:91.54557 [1000] validation_0-rmse:35.00506 validation_1-rmse:91.67583 [1100] validation_0-rmse:34.49264 validation_1-rmse:91.98362 [1200] validation_0-rmse:34.10893 validation_1-rmse:91.89410 [1300] validation_0-rmse:33.72622 validation_1-rmse:91.87891 [1400] validation_0-rmse:33.29380 validation_1-rmse:91.69393 [1499] validation_0-rmse:32.84379 validation_1-rmse:91.56259
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.03, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=2, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=1500, n_jobs=-1,
num_parallel_tree=None, objective='reg:linear', ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.03, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=2, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=1500, n_jobs=-1,
num_parallel_tree=None, objective='reg:linear', ...)# # Define the parameter grid to search
# param_grid = {
# 'max_depth': [2, 3, 5],
# 'learning_rate': [0.01, 0.1, 0.2, 0.3],
# 'n_estimators': [1000, 1250, 1500, 1750, 2000],
# 'subsample': [0.8, 0.9, 1.0],
# }
# # Create an XGBoost regressor
# reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree', objective='reg:linear')
# # Use GridSearchCV for hyperparameter tuning
# grid_search = GridSearchCV(reg, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)
# grid_search.fit(X_train, y_train)
# # Print the best parameters
# print("Best Parameters:", grid_search.best_params_)
# # Get the best model
# best_reg = grid_search.best_estimator_
# # Fit the best model on the entire training set
# best_reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100)
# Build a one-column DataFrame of XGBoost gain-based feature importances,
# indexed by feature name.
fi = pd.DataFrame(data=best_reg.feature_importances_,
                 index=best_reg.feature_names_in_,
                 columns=['importance'])
# Sort the DataFrame by importance and select the top 40
top_features = fi.sort_values('importance', ascending=False).head(40)
# Reverse the order to have the most important feature at the top
# (barh draws the last row at the top of the axis)
top_features = top_features[::-1]
# Increase plot size
fig, ax = plt.subplots(figsize=(12, 8))
top_features.plot(kind='barh', title='XGBoost Feature Importance', ax=ax)
plt.show()
# Score the hold-out period with the fitted XGBoost model and attach the
# predictions to the combined results frame via a left join on the
# (date) index, alongside the ARIMA-family prediction columns.
test_orig['XGBoost_Predictions'] = best_reg.predict(X_test)
results_orig_final = results_orig.merge(test_orig[['XGBoost_Predictions']], how='left', left_index=True, right_index=True)
# Plot the original time series, the test set, and every model's predictions.
# NOTE(review): the original code labeled the SARIMAX_exog series
# 'SARIMA Predictions' (a duplicate legend entry) and drew it in the same
# purple as XGBoost, making the two lines indistinguishable — both fixed
# here by driving the overlays from one (column, color, label) table.
model_series = [
    ('ARIMA_Predictions', 'green', 'ARIMA Predictions'),
    ('Auto_ARIMA_Predictions', 'blue', 'Auto-ARIMA Predictions'),
    ('SARIMA_Predictions', 'orange', 'SARIMA Predictions'),
    ('SARIMAX_exog_Predictions', 'purple', 'SARIMAX exog Predictions'),
    ('XGBoost_Predictions', 'brown', 'XGBoost Predictions'),
]
fig, ax = plt.subplots(2, 1, figsize=(22, 10), sharex=False)
# Plot 1: Original Time Series, Test Set, and Predictions
ax[0].plot(results_orig_final.index, results_orig_final['Total_AirTraffic_count'],
           color='black', label='Original Series', linewidth=2)
ax[0].plot(test_orig.index, test_orig['Total_AirTraffic_count'],
           linestyle='--', color='red', label='Test Set Original')
for col, color, label in model_series:
    ax[0].plot(results_orig_final.index, results_orig_final[col], color=color, label=label)
# Limit the x-axis to recent data so the forecast window stays legible
start_date = pd.to_datetime("2018-07-01", format='%Y-%m-%d')
end_date = results_orig_final.index.max()
ax[0].set_xlim(start_date, end_date)
ax[0].set_ylabel('Number of Flights')
ax[0].set_title("Model Comparisons")
ax[0].legend()
# Plot 2: Predicted Time Frame Only
ax[1].plot(test_orig.index, test_orig['Total_AirTraffic_count'],
           linestyle='--', color='red', label='Test Set Original', linewidth=3)
for col, color, label in model_series:
    ax[1].plot(results_orig_final.index, results_orig_final[col], color=color, label=label)
ax[1].set_xlabel('Date')
ax[1].set_ylabel('Number of Flights')
ax[1].set_title("Predicted Time Frame")
ax[1].legend()
# Zoom the second subplot to the test window
ax[1].set_xlim(test_orig.index.min(), test_orig.index.max())
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Compute MAPE/SMAPE for each model's prediction column against the
# actuals. calculate_metrics is defined earlier in the notebook;
# pred_cols=None presumably makes it auto-detect the *_Predictions
# columns — TODO confirm against its definition.
model_results_comparison_orig = calculate_metrics(results_orig_final, test_col='Total_AirTraffic_count', pred_cols=None)
model_results_comparison_orig
| ARIMA_Predictions | Auto_ARIMA_Predictions | SARIMAX_exog_Predictions | SARIMA_Predictions | XGBoost_Predictions | |
|---|---|---|---|---|---|
| MAPE | 13.733473 | 12.187771 | 8.603332 | 13.368576 | 8.135046 |
| SMAPE | 12.776799 | 11.583008 | 9.268125 | 13.338992 | 8.489236 |